import pandas as pd

# Load the 5,000-song Spotify dataset and normalize the column labels
# (some headers carry stray whitespace in the raw CSV).
file_path = r'D:\Bootcamp\Main\3_spotify_5000_songs.csv'
spotify5k_df = pd.read_csv(file_path).rename(columns=str.strip)

# Quick structural overview, then display the frame itself.
spotify5k_df.info()
spotify5k_df
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5235 entries, 0 to 5234 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 5235 non-null int64 1 name 5235 non-null object 2 artist 5235 non-null object 3 danceability 5235 non-null float64 4 energy 5235 non-null float64 5 key 5235 non-null int64 6 loudness 5235 non-null float64 7 mode 5235 non-null int64 8 speechiness 5235 non-null float64 9 acousticness 5235 non-null float64 10 instrumentalness 5235 non-null float64 11 liveness 5235 non-null float64 12 valence 5235 non-null float64 13 tempo 5235 non-null float64 14 type 5235 non-null object 15 duration_ms 5235 non-null int64 16 time_signature 5235 non-null int64 17 id 5235 non-null object 18 html 5235 non-null object dtypes: float64(9), int64(5), object(5) memory usage: 777.2+ KB
| Unnamed: 0 | name | artist | danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | type | duration_ms | time_signature | id | html | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Se Eu Quiser Falar Com Deus ... | Gilberto Gil | 0.6580 | 0.25900 | 11 | -13.141 | 0 | 0.0705 | 0.694 | 0.000059 | 0.9750 | 0.3060 | 110.376 | 256213 | 4 | 1n7JnwviZ7zf0LR1tcGFq7 | https://open.spotify.com/track/1n7JnwviZ7zf0LR... | |
| 1 | 1 | Saudade De Bahia ... | Antônio Carlos Jobim | 0.7420 | 0.39900 | 2 | -12.646 | 1 | 0.0346 | 0.217 | 0.000002 | 0.1070 | 0.6930 | 125.039 | 191867 | 4 | 5QGM1U0eCYrQuwSJwTm5Zq | https://open.spotify.com/track/5QGM1U0eCYrQuwS... | |
| 2 | 2 | Canta Canta, Minha Gente ... | Martinho Da Vila | 0.8510 | 0.73000 | 2 | -11.048 | 1 | 0.3470 | 0.453 | 0.000063 | 0.1240 | 0.9050 | 93.698 | 152267 | 4 | 0NLIFSZxPzQhCwnkn5PJYs | https://open.spotify.com/track/0NLIFSZxPzQhCwn... | |
| 3 | 3 | Mulher Eu Sei ... | Chico César | 0.7050 | 0.05020 | 4 | -18.115 | 1 | 0.0471 | 0.879 | 0.000041 | 0.3860 | 0.5240 | 106.802 | 186227 | 4 | 3mXqOdlLE1k67WsAxryPFs | https://open.spotify.com/track/3mXqOdlLE1k67Ws... | |
| 4 | 4 | Rosa Morena ... | Kurt Elling | 0.6510 | 0.11900 | 6 | -19.807 | 1 | 0.0380 | 0.916 | 0.000343 | 0.1040 | 0.4020 | 120.941 | 273680 | 4 | 7bSzjzjTkWT2CkIPPdp0eA | https://open.spotify.com/track/7bSzjzjTkWT2CkI... | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5230 | 5230 | 1812 Festival Overture, Op. 49: 1812 Overture,... | Pyotr Ilyich Tchaikovsky | 0.2020 | 0.06940 | 3 | -23.390 | 1 | 0.0473 | 0.917 | 0.715000 | 0.0832 | 0.0655 | 87.906 | 995000 | 4 | 1aEhYlZtypmipA06SDJ4U3 | https://open.spotify.com/track/1aEhYlZtypmipA0... | |
| 5231 | 5231 | Winter Fragments pour ensemble instrumental, s... | Tristan Murail | 0.1880 | 0.10100 | 6 | -21.873 | 1 | 0.0442 | 0.855 | 0.426000 | 0.0694 | 0.0355 | 83.134 | 855000 | 4 | 1Gfqe7TAKklmuZf6hxsH6h | https://open.spotify.com/track/1Gfqe7TAKklmuZf... | |
| 5232 | 5232 | Schoenberg: 5 Orchestral Pieces, Op. 16: No. 3... | Arnold Schoenberg | 0.0596 | 0.00093 | 9 | -42.959 | 1 | 0.0434 | 0.951 | 0.969000 | 0.0772 | 0.0344 | 71.573 | 238187 | 3 | 2XNwnFrdMDpismp0VUZ7cU | https://open.spotify.com/track/2XNwnFrdMDpismp... | |
| 5233 | 5233 | Serenade For Strings In E, Op.22, B. 52: 1. Mo... | Antonín Dvořák | 0.1330 | 0.02080 | 4 | -29.443 | 1 | 0.0419 | 0.883 | 0.505000 | 0.1110 | 0.0591 | 67.109 | 314307 | 4 | 7ucDwgMtE3YJtEfTbuRhy0 | https://open.spotify.com/track/7ucDwgMtE3YJtEf... | |
| 5234 | 5234 | Ravel: Boléro, M. 81 ... | Maurice Ravel | 0.3540 | 0.04870 | 5 | -24.568 | 1 | 0.0323 | 0.783 | 0.484000 | 0.2840 | 0.2530 | 64.120 | 948787 | 3 | 7E1ErYYCn0lYjHODZ1qGuB | https://open.spotify.com/track/7E1ErYYCn0lYjHO... |
5235 rows × 19 columns
spotify5k_df.axes
[RangeIndex(start=0, stop=5235, step=1),
Index(['Unnamed: 0', 'name', 'artist', 'danceability', 'energy', 'key',
'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
'liveness', 'valence', 'tempo', 'type', 'duration_ms', 'time_signature',
'id', 'html'],
dtype='object')]
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pandas as pd

# Numeric audio features used for clustering (unscaled in this first pass).
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
X = spotify5k_df[features]

# Elbow method: fit KMeans for k = 1..10 and record the inertia (WCSS).
wcss = []
k_range = range(1, 11)
for k in k_range:
    # n_init=10 set explicitly: the sklearn default changes to 'auto' in 1.4
    # and the implicit default emitted a FutureWarning on every fit.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(10, 6))
plt.plot(k_range, wcss, marker='o', linestyle='-', color='b')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.title('Elbow Curve for KMeans Clustering')
plt.xticks(k_range)
plt.show()

# Tabular view of the same curve.
wcss_data = pd.DataFrame({'Number of Clusters (k)': k_range, 'WCSS': wcss})
print("WCSS Data:")
wcss_data
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. 
Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
WCSS Data:
| Number of Clusters (k) | WCSS | |
|---|---|---|
| 0 | 1 | 4.481778e+06 |
| 1 | 2 | 1.744851e+06 |
| 2 | 3 | 8.550621e+05 |
| 3 | 4 | 5.981134e+05 |
| 4 | 5 | 4.404780e+05 |
| 5 | 6 | 3.650132e+05 |
| 6 | 7 | 3.213276e+05 |
| 7 | 8 | 2.811246e+05 |
| 8 | 9 | 2.543345e+05 |
| 9 | 10 | 2.315212e+05 |
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, PowerTransformer

# Numeric audio features used for clustering.
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
# Extract the feature matrix here instead of relying on X from an earlier cell
# (the original left this line commented out).
X = spotify5k_df[features]

# Number of clusters fitted for every scaler.
k = 4

# Scaler labels, index-aligned with the scaler list in the loop below.
# Fixed: index 0 said 'Raw' while the None branch labeled it 'Raw Data',
# so the list entry was silently unused.
scaler_names = ['Raw Data', 'StandardScaler', 'MinMaxScaler', 'RobustScaler', 'MaxAbsScaler', 'PowerTransformer']

# WCSS per scaler.
# NOTE(review): inertia values computed on differently scaled data are not
# directly comparable -- MinMaxScaler "wins" largely because it shrinks the
# feature ranges, not because the clusters are intrinsically better.
wcss_dict = {}

# Consistent per-cluster colors across all radar charts.
colors = sns.color_palette("tab10")

# Cluster centers per scaler (coordinates are in that scaler's feature space).
cluster_centers = {}

for i, scaler in enumerate([None, StandardScaler(), MinMaxScaler(), RobustScaler(), MaxAbsScaler(), PowerTransformer()]):
    # Scale the features when a scaler is supplied; None means raw data.
    if scaler is not None:
        X_scaled = scaler.fit_transform(X)
    else:
        X_scaled = X
    scaler_name = scaler_names[i]

    # n_init=10 matches the sklearn 1.4 behavior and avoids the FutureWarning.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)

    wcss_dict[scaler_name] = kmeans.inertia_
    cluster_centers[scaler_name] = kmeans.cluster_centers_

    # Radar chart of the k cluster centroids across the 9 features.
    plt.figure(figsize=(10, 6))
    angles = np.linspace(0, 2 * np.pi, len(features), endpoint=False).tolist()
    angles += angles[:1]  # repeat the first angle so the polygon closes
    ax = plt.subplot(111, polar=True)
    for idx, center in enumerate(kmeans.cluster_centers_):
        # Close the loop on the values too; round for readable markers.
        values = np.round(np.concatenate((center, [center[0]])), 2)
        ax.plot(angles, values, marker='o', linestyle='-', color=colors[idx], linewidth=2, label=f'Cluster {idx+1}')
        ax.fill(angles, values, color=colors[idx], alpha=0.25)
    ax.set_yticklabels([])
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(features, fontsize=10)
    plt.title(f'Radar Chart for Cluster Centers ({scaler_name})', loc='left', fontsize=12, pad=20)
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=k)
    plt.show()

    # Show the cumulative WCSS table after each scaler finishes.
    scaler_df = pd.DataFrame(list(wcss_dict.items()), columns=['Scaler', 'WCSS'])
    print(f"{scaler_name} chart")
    print(f"Table for {scaler_name}:")
    display(scaler_df)

# Pick the scaler with the minimum WCSS (see caveat on wcss_dict above).
best_scaler = min(wcss_dict, key=wcss_dict.get)
print(f"\nBased on the Within-Cluster-Sum-of-Squares (WCSS), the best scaler to use is {best_scaler}. Using this scaler helps in minimizing the WCSS, indicating better cluster formation.")
print(f"\nChoosing an appropriate scaler is crucial as it affects the clustering results. With {k} clusters, it is recommended to use the {best_scaler} scaler to create the clusters.")
print("\nAdditional points: \n1) A lower WCSS score indicates that the data points within each cluster are closer to their respective centroids, implying more compact and well-separated clusters. \n2) By selecting the scaler that yields the lowest WCSS, we aim to achieve the most meaningful and distinct cluster separation. \n3) The shaded area in each radar chart represents the coverage or extent of each cluster's features in the scaled feature space. It visually demonstrates how different clusters vary in their feature composition and distribution.")
Raw Data chart Table for Raw Data:
| Scaler | WCSS | |
|---|---|---|
| 0 | Raw Data | 598113.363521 |
StandardScaler chart Table for StandardScaler:
| Scaler | WCSS | |
|---|---|---|
| 0 | Raw Data | 598113.363521 |
| 1 | StandardScaler | 24522.213055 |
MinMaxScaler chart Table for MinMaxScaler:
| Scaler | WCSS | |
|---|---|---|
| 0 | Raw Data | 598113.363521 |
| 1 | StandardScaler | 24522.213055 |
| 2 | MinMaxScaler | 998.931204 |
RobustScaler chart Table for RobustScaler:
| Scaler | WCSS | |
|---|---|---|
| 0 | Raw Data | 598113.363521 |
| 1 | StandardScaler | 24522.213055 |
| 2 | MinMaxScaler | 998.931204 |
| 3 | RobustScaler | 16081.887091 |
MaxAbsScaler chart Table for MaxAbsScaler:
| Scaler | WCSS | |
|---|---|---|
| 0 | Raw Data | 598113.363521 |
| 1 | StandardScaler | 24522.213055 |
| 2 | MinMaxScaler | 998.931204 |
| 3 | RobustScaler | 16081.887091 |
| 4 | MaxAbsScaler | 999.866939 |
PowerTransformer chart Table for PowerTransformer:
| Scaler | WCSS | |
|---|---|---|
| 0 | Raw Data | 598113.363521 |
| 1 | StandardScaler | 24522.213055 |
| 2 | MinMaxScaler | 998.931204 |
| 3 | RobustScaler | 16081.887091 |
| 4 | MaxAbsScaler | 999.866939 |
| 5 | PowerTransformer | 24975.059530 |
Based on the Within-Cluster-Sum-of-Squares (WCSS), the best scaler to use is MinMaxScaler. Using this scaler helps in minimizing the WCSS, indicating better cluster formation. Choosing an appropriate scaler is crucial as it affects the clustering results. With 4 clusters, it is recommended to use the MinMaxScaler scaler to create the clusters. Additional points: 1) A lower WCSS score indicates that the data points within each cluster are closer to their respective centroids, implying more compact and well-separated clusters. 2) By selecting the scaler that yields the lowest WCSS, we aim to achieve the most meaningful and distinct cluster separation. 3) The shaded area in each radar chart represents the coverage or extent of each cluster's features in the scaled feature space. It visually demonstrates how different clusters vary in their feature composition and distribution.
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import pandas as pd

features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
X = spotify5k_df[features]

# Standardize first: KMeans is distance-based, so features must share a scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow method on the standardized data.
wcss = []
for k in range(1, 11):
    # n_init=10 set explicitly to silence the sklearn 1.4 FutureWarning.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss, marker='o', linestyle='-')
plt.xlabel('Number of Clusters')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.show()
elbow_df = pd.DataFrame({'Number of Clusters': range(1, 11), 'WCSS': wcss})
elbow_df

# Silhouette analysis -- starts at k=2 (the score is undefined for k=1).
silhouette_scores = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

plt.plot(range(2, 11), silhouette_scores, marker='o', linestyle='-')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score for Optimal Number of Clusters')
plt.show()
silhouette_df = pd.DataFrame({'Number of Clusters': range(2, 11), 'Silhouette Score': silhouette_scores})
silhouette_df
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. 
Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. 
Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
| Number of Clusters | Silhouette Score | |
|---|---|---|
| 0 | 2 | 0.333466 |
| 1 | 3 | 0.270886 |
| 2 | 4 | 0.276123 |
| 3 | 5 | 0.267038 |
| 4 | 6 | 0.247665 |
| 5 | 7 | 0.215942 |
| 6 | 8 | 0.213132 |
| 7 | 9 | 0.196396 |
| 8 | 10 | 0.187944 |
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# NOTE(review): X here comes from the previous cell (spotify5k_df[features]) --
# this cell does not redefine it; confirm execution order before rerunning.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Full PCA first, only to inspect the explained-variance profile.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)
explained_variance = pca.explained_variance_ratio_
cumulative_explained_variance = explained_variance.cumsum()
plt.plot(range(1, len(explained_variance) + 1), cumulative_explained_variance, marker='o', linestyle='-')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Cumulative Explained Variance Ratio by Principal Components')
plt.grid(False)
plt.show()

# Keep 5 components (chosen from the curve above), then cluster in PCA space.
num_components = 5
pca = PCA(n_components=num_components)
X_pca = pca.fit_transform(X_scaled)

num_clusters = 4
# n_init=10 set explicitly to silence the sklearn 1.4 FutureWarning.
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
kmeans.fit(X_pca)
labels = kmeans.labels_

# Attach the cluster assignment to the main DataFrame and summarize sizes.
spotify5k_df['Cluster'] = labels
cluster_counts = spotify5k_df['Cluster'].value_counts()
print("Cluster Counts:\n", cluster_counts)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
Cluster Counts: Cluster 2 2602 3 1343 0 878 1 412 Name: count, dtype: int64
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Re-cluster on the standardized features directly (no PCA this time).
# Note this overwrites both X and the 'Cluster' column from earlier cells.
scaler = StandardScaler()
X = scaler.fit_transform(spotify5k_df[features])
# n_init=10 set explicitly to silence the sklearn 1.4 FutureWarning.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
spotify5k_df['Cluster'] = kmeans.fit_predict(X)

# Profile each cluster by the mean of every numeric column.
numeric_columns = spotify5k_df.select_dtypes(include=['float64', 'int64'])
cluster_means = spotify5k_df.groupby('Cluster')[numeric_columns.columns].mean()
print("Cluster Characteristics (Mean Feature Values):\n")
cluster_means
import matplotlib.pyplot as plt

# Scatter the songs on the first two standardized features, colored by cluster.
# Fixed: the axes plot columns 0 and 1 of X (danceability, energy) but were
# labeled with the generic 'Feature 1' / 'Feature 2'; use the real names.
plt.scatter(X[:, 0], X[:, 1], c=spotify5k_df['Cluster'], cmap='viridis')
plt.xlabel(features[0])
plt.ylabel(features[1])
plt.title('Clusters of Songs')
plt.colorbar(label='Cluster')
plt.show()

# One histogram figure per feature, with the four clusters overlaid.
for feature in features:
    plt.figure(figsize=(8, 6))
    for cluster_id in range(4):
        cluster_data = spotify5k_df[spotify5k_df['Cluster'] == cluster_id][feature]
        plt.hist(cluster_data, bins=20, alpha=0.6, label=f'Cluster {cluster_id}')
    plt.title(f'Distribution of {feature} by Cluster')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
Cluster Characteristics (Mean Feature Values):
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Sanity check: can a simple 5-NN classifier recover the KMeans cluster
# labels from the same standardized features? (80/20 split, fixed seed.)
X_train, X_test, y_train, y_test = train_test_split(
    X, spotify5k_df['Cluster'], test_size=0.2, random_state=42
)
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)
accuracy = accuracy_score(y_test, knn_classifier.predict(X_test))
print("Accuracy:", accuracy)
Accuracy: 0.9684813753581661
spotify5k_df.dtypes
Unnamed: 0 int64 name object artist object danceability float64 energy float64 key int64 loudness float64 mode int64 speechiness float64 acousticness float64 instrumentalness float64 liveness float64 valence float64 tempo float64 type object duration_ms int64 time_signature int64 id object html object Cluster int32 dtype: object
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def normalize_tempo(tempo):
    """Min-max scale tempo into [0, 1] using the dataset-wide tempo range.

    Uses only element-wise arithmetic, so it accepts either a scalar or a
    whole pandas Series.
    """
    max_tempo = spotify5k_df['tempo'].max()
    min_tempo = spotify5k_df['tempo'].min()
    return (tempo - min_tempo) / (max_tempo - min_tempo)

# Vectorized: one call on the full Series replaces Series.apply, which
# recomputed max()/min() over the whole column for every row (O(n^2)).
spotify5k_df['normalized_tempo'] = normalize_tempo(spotify5k_df['tempo'])

# Human-readable labels for the 4 KMeans clusters.
# NOTE(review): these names were assigned manually -- verify they match the
# fitted statistics below (e.g. cluster 1's text says "high energy" while its
# mean energy in the printed table is ~0.15).
cluster_names = {
    0: 'Serene Sounds',
    1: 'Pulsating Rhythms',
    2: 'Tranquil Tunes',
    3: 'Melancholic Melodies',
}

# Per-cluster means of the mood-related features; 'normalized_tempo' is used
# in place of raw 'tempo' so all columns share a comparable 0-1 scale.
cluster_statistics = spotify5k_df.groupby('Cluster').agg({
    'danceability': 'mean',
    'energy': 'mean',
    'valence': 'mean',
    'normalized_tempo': 'mean',  # normalized so it plots alongside the others
    'acousticness': 'mean',
    'speechiness': 'mean'
})

# Narrative blurbs with the actual fitted means interpolated in.
cluster_explanations = {
    0: f"Songs with serene and calming vibes characterized by high valence ({cluster_statistics.loc[0, 'valence']:.2f}) and moderate tempo ({cluster_statistics.loc[0, 'normalized_tempo']:.2f}).",
    1: f"Tracks featuring pulsating rhythms and high energy suitable for dancing with high energy ({cluster_statistics.loc[1, 'energy']:.2f}) and tempo ({cluster_statistics.loc[1, 'normalized_tempo']:.2f}).",
    2: f"Music with tranquil melodies and moderate energy levels, perfect for unwinding with a balanced mix of valence ({cluster_statistics.loc[2, 'valence']:.2f}) and tempo ({cluster_statistics.loc[2, 'normalized_tempo']:.2f}).",
    3: f"Melancholic tunes with low valence ({cluster_statistics.loc[3, 'valence']:.2f}) and a somber atmosphere, often featuring high acousticness ({cluster_statistics.loc[3, 'acousticness']:.2f}).",
}

# One annotated bar chart per cluster, plus its stats row and explanation.
for cluster_id, name in cluster_names.items():
    print(f"Cluster {cluster_id} ({name}): {cluster_explanations[cluster_id]}")
    display(cluster_statistics.loc[[cluster_id]])
    plt.figure(figsize=(10, 6))
    sns.set(style="white")  # remove grid
    sns.barplot(x=cluster_statistics.columns, y=cluster_statistics.loc[cluster_id].values, palette="magma")
    plt.title(f'Cluster {cluster_id} - {name} Features')
    plt.xticks(rotation=45)
    for index, value in enumerate(cluster_statistics.loc[cluster_id]):
        plt.text(index, value, f'{value:.2f}', ha='center', va='bottom')
    plt.show()
    print('\n')
Cluster 0 (Serene Sounds): Songs with serene and calming vibes characterized by high valence (0.61) and moderate tempo (0.56).
| danceability | energy | valence | normalized_tempo | acousticness | speechiness | |
|---|---|---|---|---|---|---|
| Cluster | ||||||
| 0 | 0.645645 | 0.684244 | 0.611999 | 0.564491 | 0.236267 | 0.056894 |
Cluster 1 (Pulsating Rhythms): Tracks featuring pulsating rhythms and high energy suitable for dancing with high energy (0.15) and tempo (0.49).
| danceability | energy | valence | normalized_tempo | acousticness | speechiness | |
|---|---|---|---|---|---|---|
| Cluster | ||||||
| 1 | 0.354487 | 0.146981 | 0.2049 | 0.491445 | 0.913782 | 0.045184 |
Cluster 2 (Tranquil Tunes): Music with tranquil melodies and moderate energy levels, perfect for unwinding with a balanced mix of valence (0.58) and tempo (0.58).
| danceability | energy | valence | normalized_tempo | acousticness | speechiness | |
|---|---|---|---|---|---|---|
| Cluster | ||||||
| 2 | 0.683221 | 0.738124 | 0.579874 | 0.577067 | 0.17778 | 0.267501 |
Cluster 3 (Melancholic Melodies): Melancholic tunes with low valence (0.23) and a somber atmosphere, often featuring high acousticness (0.01).
| danceability | energy | valence | normalized_tempo | acousticness | speechiness | |
|---|---|---|---|---|---|---|
| Cluster | ||||||
| 3 | 0.288293 | 0.904954 | 0.225855 | 0.57095 | 0.014879 | 0.103418 |
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Place the four cluster centroids in (normalized tempo, valence) space and
# annotate each point with its name and explanation text.
plt.figure(figsize=(12, 10))
for cid, label in cluster_names.items():
    x_val = cluster_statistics.loc[cid, 'normalized_tempo']
    y_val = cluster_statistics.loc[cid, 'valence']
    plt.scatter(x_val, y_val, label=label, s=200, alpha=0.7)
    plt.text(x_val, y_val,
             f"{label}\n{cluster_explanations[cid]}",
             fontsize=10, ha='center', va='center', wrap=True)

plt.xlabel('Normalized Tempo', fontsize=12)
plt.ylabel('Valence', fontsize=12)
plt.title('Cluster Analysis based on Tempo and Valence', fontsize=14)
plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.2), ncol=2, fontsize=10)
plt.grid(False)
plt.margins(0.05)
plt.show()

# Explanation
print("\nExplanation:")
print("We chose to plot tempo and valence as they are two key features that determine the mood of a song.")
print("Tempo indicates the speed or pace of the music, while valence represents the positivity or negativity of the musical content.")
print("By analyzing these two features, we are able to identify distinct clusters representing songs with different mood characteristics.")

# Conclusion
print("\nConclusion:")
print("Based on the clustering analysis, we identified distinct clusters representing songs with different mood characteristics.")
print("Machine learning can be a valuable tool for creating playlists as it automatically categorizes songs based on their features, helping users discover music that matches their mood and preferences.")
Explanation: We chose to plot tempo and valence as they are two key features that determine the mood of a song. Tempo indicates the speed or pace of the music, while valence represents the positivity or negativity of the musical content. By analyzing these two features, we are able to identify distinct clusters representing songs with different mood characteristics. Conclusion: Based on the clustering analysis, we identified distinct clusters representing songs with different mood characteristics. Machine learning can be a valuable tool for creating playlists as it automatically categorizes songs based on their features, helping users discover music that matches their mood and preferences.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

# Bag-of-words over "track name + artist", used to characterize each cluster.
spotify5k_df['text'] = spotify5k_df['name'] + ' ' + spotify5k_df['artist']
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(spotify5k_df['text'])
vocab = vectorizer.get_feature_names_out()
word_counts = pd.DataFrame(X.toarray(), columns=vocab)
word_counts['Cluster'] = spotify5k_df['Cluster']

cluster_names = {
    0: 'Relaxing Vibes',
    1: 'Energetic Beats',
    2: 'Chill Out',
    3: 'Melancholic Melodies'
}

# Fixed: iterate the actual cluster labels rather than
# range(len(unique())), which silently mislabels or skips clusters whenever
# the labels are not a contiguous 0..k-1 range.
for cluster_id in sorted(word_counts['Cluster'].unique()):
    words_in_cluster = word_counts[word_counts['Cluster'] == cluster_id].drop('Cluster', axis=1)
    word_freq = words_in_cluster.sum().to_dict()

    # Word cloud of this cluster's name/artist vocabulary.
    wordcloud = WordCloud(width=800, height=400, background_color='white', prefer_horizontal=0.9).generate_from_frequencies(word_freq)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for {cluster_names.get(cluster_id, "Cluster " + str(cluster_id))}')
    plt.axis('off')
    plt.show()

    # Horizontal bar chart of the 10 most frequent words, frameless.
    top_words = words_in_cluster.sum().sort_values(ascending=False).head(10)
    sns.set_palette('bright')
    plt.figure(figsize=(10, 6))
    sns.barplot(x=top_words.values, y=top_words.index)
    plt.title(f'Top 10 Words in {cluster_names.get(cluster_id, "Cluster " + str(cluster_id))}')
    for side in ('top', 'right', 'left', 'bottom'):
        plt.gca().spines[side].set_visible(False)
    for i, (word, count) in enumerate(zip(top_words.index, top_words.values)):
        plt.text(count, i, f' {word} ({count})', fontsize=10, style='italic', va='center')
    plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import pdist
from warnings import simplefilter
# Ignore future warnings
simplefilter(action='ignore', category=FutureWarning)
# Select features for clustering
# (numeric audio descriptors only; id/text/duration columns are excluded)
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
            'instrumentalness', 'liveness', 'valence', 'tempo']
# Scale the features
# NOTE(review): these except branches only print and fall through, so a
# failure leaves X / X_pca / best_k undefined and later statements raise
# NameError instead of stopping cleanly.
try:
    scaler = StandardScaler()
    X = scaler.fit_transform(spotify5k_df[features])
except KeyError:
    print("Error: Features not found in the dataset.")
# Use PCA to determine the number of clusters
# First fit a full-rank PCA, then keep the smallest number of components
# whose cumulative explained variance reaches 95% and re-fit at that size.
try:
    pca = PCA(n_components=len(features))
    X_pca = pca.fit_transform(X)
    cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
    n_components = np.argmax(cumulative_variance_ratio >= 0.95) + 1
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
except ValueError:
    print("Error: Not enough features for PCA.")
# Find optimal number of clusters using silhouette score
# Tries k = 2..10 and keeps the k with the best average silhouette.
# NOTE(review): the per-k scores are discarded here, which later forces the
# silhouette plot to be recomputed from scratch.
best_score = -1
best_k = -1
for k in range(2, 11):
    try:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(X_pca)
        silhouette_avg = silhouette_score(X_pca, kmeans.labels_)
        if silhouette_avg > best_score:
            best_score = silhouette_avg
            best_k = k
    except ValueError:
        print("Error: Unable to find optimal number of clusters.")
# Train KMeans with optimal number of clusters
# Labels are written back onto the songs DataFrame as the 'cluster' column.
try:
    kmeans = KMeans(n_clusters=best_k, random_state=42)
    kmeans.fit(X_pca)
    spotify5k_df['cluster'] = kmeans.labels_
except ValueError:
    print("Error: Unable to train KMeans model.")
# Function to calculate diversity of recommended songs
def calculate_diversity(recommended_songs, feature_cols=None):
    """Score a set of recommended songs as 1 - mean pairwise cosine distance.

    NOTE(review): despite the name, a HIGH value means the songs are very
    similar (distance near 0); true diversity would be the mean distance
    itself.  The formula is kept to preserve the notebook's reported scores.

    Parameters
    ----------
    recommended_songs : pd.DataFrame
        Rows of the songs DataFrame containing the audio-feature columns.
    feature_cols : list[str], optional
        Columns to compare.  Defaults to the module-level ``features`` list.

    Returns
    -------
    float or None
        The score; ``nan`` when fewer than two songs are given (there are no
        pairs to compare), or ``None`` when the feature columns are missing
        (an error is printed, matching the original behavior).
    """
    cols = features if feature_cols is None else feature_cols
    try:
        recommended_features = recommended_songs[cols].to_numpy()
    except KeyError:
        print("Error: Features not found in recommended songs.")
        return None
    # pdist needs at least two rows; np.mean of an empty array would emit a
    # RuntimeWarning and silently yield nan — make that case explicit.
    if len(recommended_features) < 2:
        return float('nan')
    cosine_distances = pdist(recommended_features, metric='cosine')
    return 1 - np.mean(cosine_distances)
# Function to recommend songs from a given cluster
def recommend_songs(cluster_id, num_songs=5, df=None, random_state=None):
    """Randomly sample up to ``num_songs`` songs from one cluster.

    Parameters
    ----------
    cluster_id : int
        Value matched against the 'cluster' column.
    num_songs : int, optional
        Maximum number of songs to return; fewer if the cluster is smaller.
    df : pd.DataFrame, optional
        Songs DataFrame; defaults to the module-level ``spotify5k_df``.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` for reproducible picks.

    Returns
    -------
    pd.DataFrame or None
        Sampled rows (possibly empty for an unknown cluster id), or ``None``
        when the 'cluster' column is missing (an error is printed, matching
        the original behavior).
    """
    source = spotify5k_df if df is None else df
    try:
        cluster_data = source[source['cluster'] == cluster_id]
    except KeyError:
        print("Error: Cluster ID not found.")
        return None
    return cluster_data.sample(min(num_songs, len(cluster_data)), random_state=random_state)
# Visualize clusters
def visualize_clusters(X_pca, labels, centroids):
    """Scatter-plot songs in the first two PCA dimensions, one colour per
    cluster, with centroids overlaid as black stars.

    Parameters
    ----------
    X_pca : ndarray of shape (n_samples, >= 2)
        PCA-projected feature matrix.
    labels : ndarray of shape (n_samples,)
        Cluster label for each sample.
    centroids : ndarray of shape (n_clusters, >= 2)
        Cluster centres in the same PCA space.
    """
    plt.figure(figsize=(10, 6))
    # Iterate over the labels that occur rather than range(len(unique)):
    # the original assumed labels are exactly 0..k-1, which misplots any
    # non-contiguous labelling.
    for label in np.unique(labels):
        mask = labels == label
        plt.scatter(X_pca[mask, 0], X_pca[mask, 1], label=f'Cluster {label}')
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=200, c='black', label='Centroids')
    plt.title('Clusters')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.legend()
    plt.show()
# Visualize explained variance ratio
def visualize_variance(pca):
    """Plot the cumulative explained-variance curve of a fitted PCA."""
    cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
    plt.figure(figsize=(8, 6))
    plt.plot(cumulative_ratio)
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.title('PCA Explained Variance Ratio')
    plt.grid(True)
    plt.show()
# Visualize silhouette scores
def visualize_silhouette_scores(scores, k_start=2):
    """Plot silhouette score against the number of clusters.

    Parameters
    ----------
    scores : sequence of float
        One silhouette score per candidate k, in increasing-k order.
    k_start : int, optional
        The k that ``scores[0]`` corresponds to (default 2, matching the
        original hard-coded range).
    """
    # Derive the x-axis from the data instead of hard-coding range(2, 11),
    # which raised or mislabelled whenever len(scores) != 9.
    ks = range(k_start, k_start + len(scores))
    plt.figure(figsize=(8, 6))
    plt.plot(ks, scores, marker='o')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Score vs Number of Clusters')
    plt.grid(True)
    plt.show()
# Example: Recommend songs from each cluster and calculate diversity
try:
    print("**Playlist Recommendations and Diversity Scores:**")
    playlists = []
    for cluster_id in range(best_k):
        recommended_songs = recommend_songs(cluster_id)
        playlist_name = f"Playlist {cluster_id + 1}: {', '.join(recommended_songs['name'].tolist())}"
        playlists.append({'Name': playlist_name, 'Diversity': calculate_diversity(recommended_songs)})
    playlists_df = pd.DataFrame(playlists)
    display(playlists_df)
except TypeError:
    print("Error: Unable to calculate diversity.")
# Visualizations
print("\n**Visualizations:**")
visualize_clusters(X_pca, kmeans.labels_, kmeans.cluster_centers_)
visualize_variance(pca)
# Fit a fresh model for every candidate k.  The original passed
# [silhouette_score(X_pca, kmeans.labels_) for k in range(2, 11)], which
# evaluated the SAME fitted model nine times and plotted a flat line.
k_scores = []
for k in range(2, 11):
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(X_pca)
    k_scores.append(silhouette_score(X_pca, km.labels_))
visualize_silhouette_scores(k_scores)
# Answers to questions
# Kept as a list of lines + one print loop so the Q&A copy is easy to scan
# and extend; output is identical to the original run of print statements.
_qa_lines = [
    "\n**Answers to Questions:**",
    "\n**How did you create your prototype?**",
    "The prototype was created using Python with the scikit-learn library for machine learning algorithms.\n",
    "**How many playlists (clusters) are there?**",
    "The number of playlists (clusters) is determined dynamically based on the data using the silhouette score.\n",
    "**What audio features did you use and what did you drop? Why?**",
    "We used features like danceability, energy, loudness, etc., as they are relevant to song characteristics.",
    "We dropped features that were not considered to significantly influence playlist creation, such as 'duration_ms' or 'time_signature'.\n",
    "**Is the prototype effective at creating cohesive playlists?**",
    "Cohesiveness of playlists can be evaluated based on diversity and user feedback.\n",
    "**Are Spotify’s audio features capable of identifying 'similar songs' as defined by humanly detectable criteria?**",
    "This can be determined through user feedback and comparison with manually curated playlists.\n",
    "**What kind of data might help us create better playlists?**",
    "Additional data such as user preferences, listening history, genre information, etc., can improve playlist quality.\n",
    "**Is K-Means a good method for creating playlists? Provide pros and cons.**",
    "Pros:",
    "- Simple and easy to implement.",
    "- Scalable to large datasets.",
    "\nCons:",
    "- Assumes clusters are spherical and of equal size.",
    "- Sensitive to initialization.\n",
    "**What would be your next steps if you continued with this project?**",
    "Further refinement of clustering algorithms, incorporation of user feedback, and integration with a music streaming platform for real-time playlist generation.",
]
for _line in _qa_lines:
    print(_line)
**Playlist Recommendations and Diversity Scores:**
| Name | Diversity | |
|---|---|---|
| 0 | Playlist 1: Spawn of Flesh ... | 0.999342 |
| 1 | Playlist 2: Ui! (Voce Inventa) ... | 0.991936 |
**Visualizations:**
**Answers to Questions:** **How did you create your prototype?** The prototype was created using Python with the scikit-learn library for machine learning algorithms. **How many playlists (clusters) are there?** The number of playlists (clusters) is determined dynamically based on the data using the silhouette score. **What audio features did you use and what did you drop? Why?** We used features like danceability, energy, loudness, etc., as they are relevant to song characteristics. We dropped features that were not considered to significantly influence playlist creation, such as 'duration_ms' or 'time_signature'. **Is the prototype effective at creating cohesive playlists?** Cohesiveness of playlists can be evaluated based on diversity and user feedback. **Are Spotify’s audio features capable of identifying 'similar songs' as defined by humanly detectable criteria?** This can be determined through user feedback and comparison with manually curated playlists. **What kind of data might help us create better playlists?** Additional data such as user preferences, listening history, genre information, etc., can improve playlist quality. **Is K-Means a good method for creating playlists? Provide pros and cons.** Pros: - Simple and easy to implement. - Scalable to large datasets. Cons: - Assumes clusters are spherical and of equal size. - Sensitive to initialization. **What would be your next steps if you continued with this project?** Further refinement of clustering algorithms, incorporation of user feedback, and integration with a music streaming platform for real-time playlist generation.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from IPython.display import display
# Initialize the VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()
# Perform sentiment analysis on the 'name' column and handle NaNs
# (VADER compound score lies in [-1, 1]; an empty string scores 0.0)
spotify5k_df['sentiment_score'] = spotify5k_df['name'].fillna('').apply(lambda x: sia.polarity_scores(x)['compound'])
# Fit a TF-IDF Vectorizer
# NOTE(review): this vectorizes the raw 'name' column; the fillna('') above
# only patched the sentiment input, not this one.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(spotify5k_df['name'])
# Find optimal number of clusters using silhouette score
silhouette_scores = []
for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(X)
    silhouette_scores.append(silhouette_score(X, cluster_labels))
optimal_clusters_silhouette = silhouette_scores.index(max(silhouette_scores)) + 2
# Apply PCA to visualize clusters
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X.toarray())
# Find optimal number of clusters using PCA
# NOTE(review): K-means inertia decreases as k grows, so
# inertia.index(min(inertia)) always selects the largest k tried (10, as the
# printed output below confirms).  An elbow/knee criterion is needed for
# this selection to be meaningful.
inertia = []
for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X_pca)
    inertia.append(kmeans.inertia_)
optimal_clusters_pca = inertia.index(min(inertia)) + 2
# Cluster using KMeans with optimal number of clusters
kmeans_silhouette = KMeans(n_clusters=optimal_clusters_silhouette, random_state=42)
spotify5k_df['cluster_silhouette'] = kmeans_silhouette.fit_predict(X)
# NOTE(review): this model is fitted on the TF-IDF matrix X, not on X_pca,
# even though its cluster count was chosen from PCA-space inertia and its
# labels are later plotted in PCA space.
kmeans_pca = KMeans(n_clusters=optimal_clusters_pca, random_state=42)
spotify5k_df['cluster_pca'] = kmeans_pca.fit_predict(X)
# Define cluster names based on sentiment
# NOTE(review): only labels 0-2 get a name; with 10 clusters selected above,
# labels 3-9 map to NaN (hence the 621-row tables after dropna below).  The
# names are also not derived from sentiment_score — K-means label ids are
# arbitrary.
sentiment_cluster_names = {
    0: 'Negative (Low Sentiment)',
    1: 'Neutral (Medium Sentiment)',
    2: 'Positive (High Sentiment)'
}
# Assign cluster names
spotify5k_df['cluster_silhouette'] = spotify5k_df['cluster_silhouette'].map(sentiment_cluster_names)
spotify5k_df['cluster_pca'] = spotify5k_df['cluster_pca'].map(sentiment_cluster_names)
# Explanation for choosing the number of clusters
explanation = f"The number of clusters chosen based on silhouette score: {optimal_clusters_silhouette}. " \
              f"The number of clusters chosen based on PCA: {optimal_clusters_pca}."
# Visualize clusters using PCA
plt.figure(figsize=(12, 8))
# NOTE(review): 'cluster_pca' contains NaN for unmapped labels — confirm how
# seaborn renders those points before relying on this plot.
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=spotify5k_df['cluster_pca'], palette='husl', legend='full', marker='o')
plt.title('PCA Visualization of Song Clusters')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.2), ncol=3)
plt.grid(False)
plt.show()
# Visualize clusters using Silhouette Score
# One score per k in 2..10, collected by the selection loop above.
plt.figure(figsize=(10, 6))
sns.lineplot(x=range(2, 11), y=silhouette_scores, marker='o', color='blue')
plt.title('Silhouette Score for Optimal Cluster Selection')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.grid(False)
plt.xticks(range(2, 11))
plt.show()
# Additional visualizations
# Distribution of VADER compound scores over all track names.
plt.figure(figsize=(10, 6))
sns.histplot(data=spotify5k_df, x='sentiment_score', bins=30, kde=True, color='green')
plt.title('Distribution of Sentiment Scores')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.grid(False)
plt.show()
plt.figure(figsize=(10, 6))
sns.countplot(data=spotify5k_df, x='cluster_pca', palette='husl')
plt.title('Number of Songs in Each Sentiment Cluster')
plt.xlabel('Sentiment Cluster')
plt.ylabel('Count')
plt.grid(False)
plt.show()
# Output DataFrame with cluster assignments and remove NaNs
# (drops every song whose cluster label had no entry in the name mapping)
cluster_silhouette_df = spotify5k_df[['name', 'cluster_silhouette']].dropna()
cluster_pca_df = spotify5k_df[['name', 'cluster_pca']].dropna()
# Output sentiment score for each cluster
sentiment_scores = spotify5k_df.groupby('cluster_pca')['sentiment_score'].mean()
# Explanation for cluster names
# NOTE(review): the displayed means contradict these names — the "Positive"
# cluster averages -0.044 in the output below — because cluster ids were
# named arbitrarily rather than by their actual mean sentiment.
cluster_name_explanation = f"Cluster names are based on sentiment score: " \
                           f"Negative (Low Sentiment): sentiment score < 0, " \
                           f"Neutral (Medium Sentiment): sentiment score ≈ 0, " \
                           f"Positive (High Sentiment): sentiment score > 0."
# Display the outputs
print(explanation)
print(cluster_name_explanation)
display(cluster_silhouette_df)
display(cluster_pca_df)
display(sentiment_scores)
The number of clusters chosen based on silhouette score: 10. The number of clusters chosen based on PCA: 10. Cluster names are based on sentiment score: Negative (Low Sentiment): sentiment score < 0, Neutral (Medium Sentiment): sentiment score ≈ 0, Positive (High Sentiment): sentiment score > 0.
| name | cluster_silhouette | |
|---|---|---|
| 10 | The Girl From Ipanema ... | Positive (High Sentiment) |
| 18 | Aquarela Do Brasil ... | Negative (Low Sentiment) |
| 21 | Aquarela Do Brasil ... | Negative (Low Sentiment) |
| 23 | Don't Stop the Carnival ... | Positive (High Sentiment) |
| 28 | Aquarela Do Brasil ... | Negative (Low Sentiment) |
| ... | ... | ... |
| 5197 | The Unanswered Question ... | Positive (High Sentiment) |
| 5201 | The Planets - Suite for large orchestra, Op.32... | Positive (High Sentiment) |
| 5213 | The Nutcracker, Op.71, TH.14 / Act 1: No. 2 Ma... | Positive (High Sentiment) |
| 5226 | Pines Of Rome, P. 141: 3. The Pines Of The Jan... | Positive (High Sentiment) |
| 5229 | A Flock Descends Into The Pentagonal Garden ... | Positive (High Sentiment) |
621 rows × 2 columns
| name | cluster_pca | |
|---|---|---|
| 10 | The Girl From Ipanema ... | Positive (High Sentiment) |
| 18 | Aquarela Do Brasil ... | Negative (Low Sentiment) |
| 21 | Aquarela Do Brasil ... | Negative (Low Sentiment) |
| 23 | Don't Stop the Carnival ... | Positive (High Sentiment) |
| 28 | Aquarela Do Brasil ... | Negative (Low Sentiment) |
| ... | ... | ... |
| 5197 | The Unanswered Question ... | Positive (High Sentiment) |
| 5201 | The Planets - Suite for large orchestra, Op.32... | Positive (High Sentiment) |
| 5213 | The Nutcracker, Op.71, TH.14 / Act 1: No. 2 Ma... | Positive (High Sentiment) |
| 5226 | Pines Of Rome, P. 141: 3. The Pines Of The Jan... | Positive (High Sentiment) |
| 5229 | A Flock Descends Into The Pentagonal Garden ... | Positive (High Sentiment) |
621 rows × 2 columns
cluster_pca Negative (Low Sentiment) 0.114416 Neutral (Medium Sentiment) 0.072395 Positive (High Sentiment) -0.044409 Name: sentiment_score, dtype: float64
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from IPython.display import display
# Initialize the VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()
# Perform sentiment analysis on the objective columns and handle NaNs
objective_columns = ['danceability', 'energy', 'key', 'loudness', 'mode',
                     'speechiness', 'acousticness', 'instrumentalness',
                     'liveness', 'valence', 'tempo']
# NOTE(review): DataFrame.apply defaults to axis=0, so the lambda receives
# each COLUMN as a Series and str(x) is that column's text dump — one score
# per column, indexed by column name.  Assigning that Series to a new row
# column cannot align with the integer row index, so the resulting column is
# presumably all-NaN (verify).  Running VADER, a text model, on numeric
# audio features is also semantically meaningless; the score is never used
# afterwards.
spotify5k_df['objective_sentiment_score'] = spotify5k_df[objective_columns].fillna('').apply(
    lambda x: sia.polarity_scores(str(x))['compound'])
# Fit a PCA to visualize objective clusters
X_objective = spotify5k_df[objective_columns].fillna(0)  # Fill NaNs with 0 for PCA
pca_objective = PCA(n_components=2, random_state=42)
X_pca_objective = pca_objective.fit_transform(X_objective)
# Cluster using KMeans with optimal number of clusters
# NOTE(review): features are unscaled here, so large-magnitude columns
# (loudness, tempo, duration-like values) dominate the euclidean distances;
# also, clustering runs on X_objective while the plot below shows
# X_pca_objective.
kmeans_objective = KMeans(n_clusters=3, random_state=42)
spotify5k_df['objective_cluster'] = kmeans_objective.fit_predict(X_objective)
# Define cluster names based on objective sentiment
# NOTE(review): Low/Medium/High are attached to arbitrary K-means label ids,
# not to any ordering of cluster statistics.
objective_cluster_names = {
    0: 'Low',
    1: 'Medium',
    2: 'High'
}
# Assign cluster names
spotify5k_df['objective_cluster'] = spotify5k_df['objective_cluster'].map(objective_cluster_names)
# Explanation for objective sentiment analysis
objective_explanation = "Objective sentiment analysis was performed based on the following columns: " \
                        "'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', " \
                        "'acousticness', 'instrumentalness', 'liveness', 'valence', and 'tempo'. " \
                        "Three clusters were chosen for better interpretation: Low, Medium, and High."
# Visualize objective clusters using PCA
plt.figure(figsize=(12, 8))
sns.scatterplot(x=X_pca_objective[:, 0], y=X_pca_objective[:, 1],
                hue=spotify5k_df['objective_cluster'], palette='husl',
                legend='full', marker='o')
plt.title('PCA Visualization of Objective Sentiment Clusters')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.2), ncol=3)
plt.grid(False)
plt.show()
# Compare with sentiment analysis based on 'name' column
plt.figure(figsize=(10, 6))
sns.countplot(data=spotify5k_df, x='cluster_pca', hue='objective_cluster', palette='husl')
plt.title('Comparison of Sentiment Clusters (Name vs Objective Columns)')
plt.xlabel('Sentiment Cluster (Name)')
plt.ylabel('Count')
plt.legend(title='Objective Cluster', loc='upper right')
plt.grid(False)
plt.show()
# Display the outputs
display(objective_explanation)
display(spotify5k_df[['name', 'objective_cluster']])
"Objective sentiment analysis was performed based on the following columns: 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', and 'tempo'. Three clusters were chosen for better interpretation: Low, Medium, and High."
| name | objective_cluster | |
|---|---|---|
| 0 | Se Eu Quiser Falar Com Deus ... | Medium |
| 1 | Saudade De Bahia ... | Medium |
| 2 | Canta Canta, Minha Gente ... | Low |
| 3 | Mulher Eu Sei ... | Low |
| 4 | Rosa Morena ... | Medium |
| ... | ... | ... |
| 5230 | 1812 Festival Overture, Op. 49: 1812 Overture,... | Low |
| 5231 | Winter Fragments pour ensemble instrumental, s... | Low |
| 5232 | Schoenberg: 5 Orchestral Pieces, Op. 16: No. 3... | Low |
| 5233 | Serenade For Strings In E, Op.22, B. 52: 1. Mo... | Low |
| 5234 | Ravel: Boléro, M. 81 ... | Low |
5235 rows × 2 columns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
# Select features for clustering
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
            'instrumentalness', 'liveness', 'valence', 'tempo']
# Explanation: These features are chosen because they represent different aspects of songs
print("Selected Features for Clustering:")
print(pd.DataFrame(features, columns=['Features']))
# Scale the features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(spotify5k_df[features])
# Explanation: StandardScaler is used to standardize the features, ensuring each feature has a mean of 0 and a standard deviation of 1.
print("\nFeature Scaling:")
print("The features are scaled using StandardScaler to standardize the data, making it suitable for clustering.")
# Create a DataFrame of scaled features for visualization
scaled_df = pd.DataFrame(X_scaled, columns=features)
# Plot histograms of scaled features
# (3x3 grid: one subplot per feature; enumerate is 0-based, subplot is 1-based)
plt.figure(figsize=(12, 8))
for i, feature in enumerate(features):
    plt.subplot(3, 3, i + 1)
    plt.hist(scaled_df[feature], bins=20, color='skyblue', edgecolor='black')
    plt.title(feature)
    plt.xlabel('Scaled Values')
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
# Use PCA to reduce dimensionality
# NOTE(review): n_components == len(features) keeps every component, so no
# dimensionality is actually reduced — X_pca is just a rotation of X_scaled.
pca = PCA(n_components=len(features))
X_pca = pca.fit_transform(X_scaled)
# Explanation: PCA is used to reduce the dimensionality of the data while retaining most of its variance.
print("\nDimensionality Reduction with PCA:")
print("Principal Component Analysis (PCA) is applied to reduce the dimensionality of the data while retaining most of its variance.")
# Plot explained variance ratio
# (per-component ratio, not cumulative)
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(features) + 1), pca.explained_variance_ratio_, marker='o', linestyle='--', color='b')
plt.title('Explained Variance Ratio by Principal Components')
plt.xlabel('Number of Principal Components')
plt.ylabel('Explained Variance Ratio')
plt.xticks(np.arange(1, len(features) + 1))
plt.grid(False)
plt.show()
# Find optimal number of clusters using silhouette score
# NOTE(review): 99 K-means fits plus one silhouette evaluation each over
# ~5.2k rows — the slowest loop in the notebook, and it is repeated verbatim
# in the next cell.
silhouette_scores = []
for k in range(2, 101):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_pca)
    silhouette_avg = silhouette_score(X_pca, kmeans.labels_)
    silhouette_scores.append(silhouette_avg)
# Plot silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(range(2, 101), silhouette_scores, marker='o', linestyle='-', color='r')
plt.title('Silhouette Score for Different Numbers of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.xticks(np.arange(2, 101, 5))
plt.grid(False)
plt.show()
# Based on business requirements, choose the number of clusters
# We aim to create playlists with sizes between 50 and 250 songs
# So, let's explore having between 20 and 100 clusters
print("\nChoosing Number of Clusters:")
print("Based on business requirements, we aim to create playlists with sizes between 50 and 250 songs.")
print("We will explore creating between 20 and 100 clusters to ensure playlist sizes between 50 and 250 songs.")
Selected Features for Clustering:
Features
0 danceability
1 energy
2 loudness
3 speechiness
4 acousticness
5 instrumentalness
6 liveness
7 valence
8 tempo
Feature Scaling:
The features are scaled using StandardScaler to standardize the data, making it suitable for clustering.
Dimensionality Reduction with PCA: Principal Component Analysis (PCA) is applied to reduce the dimensionality of the data while retaining most of its variance.
Choosing Number of Clusters: Based on business requirements, we aim to create playlists with sizes between 50 and 250 songs. We will explore creating between 20 and 100 clusters to ensure playlist sizes between 50 and 250 songs.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
# Step 1: Determine the optimal number of clusters
# Calculate silhouette scores for different numbers of clusters
# NOTE(review): identical to the loop in the previous cell — if both cells
# run in order, its silhouette_scores could be reused instead of refitting
# 99 models.
silhouette_scores = []
for k in range(2, 101):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_pca)
    silhouette_avg = silhouette_score(X_pca, kmeans.labels_)
    silhouette_scores.append(silhouette_avg)
# Plot silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(range(2, 101), silhouette_scores, marker='o', linestyle='-', color='limegreen')
plt.title('Silhouette Score for Different Numbers of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.xticks(np.arange(2, 101, 5))
plt.grid(False)
plt.show()
# Step 2: Perform K-means clustering with the chosen number of clusters
# Based on business requirements, let's choose the number of clusters
# We aim to create playlists with sizes between 50 and 250 songs
# So, let's explore having between 20 and 100 clusters
# NOTE(review): 50 is a business choice (5235 songs / 50 clusters ~= 105 per
# cluster on average), not the silhouette optimum computed above.
chosen_clusters = 50
# Perform K-means clustering
kmeans = KMeans(n_clusters=chosen_clusters, random_state=42)
kmeans.fit(X_pca)
# Assign songs to clusters
cluster_labels = kmeans.labels_
# Step 3: Analyze characteristics of each cluster and create playlists
# Visualize cluster sizes
plt.figure(figsize=(10, 6))
plt.hist(cluster_labels, bins=chosen_clusters, color='gold', edgecolor='black')
plt.title('Distribution of Songs Across Clusters')
plt.xlabel('Cluster')
plt.ylabel('Number of Songs')
plt.grid(False)
plt.show()
# Analyze characteristics of each cluster
# Map centroids back to original feature units: undo the PCA projection,
# then undo the standard scaling.
cluster_centers = scaler.inverse_transform(pca.inverse_transform(kmeans.cluster_centers_))
cluster_df = pd.DataFrame(cluster_centers, columns=features)
# Display cluster characteristics
print("\nCluster Characteristics:")
print(cluster_df)
# Create playlists based on clusters
# NOTE(review): size // chosen_clusters songs are drawn per cluster (e.g.
# 50 // 50 = 1) and small clusters contribute even fewer, so actual playlist
# lengths can fall short of the target sizes.  np.random.choice is unseeded,
# so the sampled playlists are not reproducible between runs.
playlist_sizes = [50, 100, 150, 200, 250]
playlists = []
for size in playlist_sizes:
    playlist = []
    for i in range(chosen_clusters):
        cluster_indices = np.where(cluster_labels == i)[0]
        cluster_indices = np.random.choice(cluster_indices, min(size // chosen_clusters, len(cluster_indices)), replace=False)
        playlist.extend(cluster_indices)
    playlists.append(playlist)
# Visualize playlist sizes
plt.figure(figsize=(10, 6))
plt.bar(range(len(playlist_sizes)), [len(p) for p in playlists], color='skyblue', edgecolor='black')
plt.xticks(range(len(playlist_sizes)), [f"{size} Songs" for size in playlist_sizes])
plt.title('Playlist Sizes')
plt.xlabel('Playlist Size')
plt.ylabel('Number of Songs')
plt.grid(False)
plt.show()
# Final Output: A data-driven analysis on clustering music data to create playlists
print("\nData-Driven Playlist Creation:")
print("By applying K-means clustering to music data, we identified distinct clusters of songs.")
print("Each cluster represents songs with similar characteristics, allowing us to create diverse playlists.")
print("Our analysis ensures that each playlist falls within the desired size range, catering to various music preferences.")
Cluster Characteristics:
danceability energy loudness speechiness acousticness \
0 0.638241 0.543509 -9.420741 0.049454 0.217128
1 0.255956 0.953867 -6.552889 0.294956 0.018788
2 0.237559 0.134845 -20.194955 0.042655 0.918593
3 0.384051 0.784051 -5.999276 0.060602 0.065965
4 0.255372 0.918642 -8.244533 0.100722 0.009430
5 0.722199 0.822593 -5.170907 0.054779 0.084628
6 0.530379 0.585765 -7.275750 0.038811 0.118160
7 0.512322 0.189918 -14.514575 0.042064 0.862069
8 0.611485 0.796667 -6.480545 0.260727 0.164447
9 0.236984 0.953076 -6.970414 0.138273 0.004265
10 0.776301 0.662301 -6.842019 0.290340 0.195811
11 0.571887 0.505642 -10.295491 0.060089 0.609679
12 0.688940 0.796894 -6.125384 0.051253 0.148324
13 0.649836 0.729164 -5.785616 0.197658 0.223944
14 0.261514 0.158233 -19.581289 0.044287 0.927461
15 0.287964 0.944371 -6.620429 0.117826 0.007488
16 0.548741 0.614519 -8.860247 0.052907 0.608185
17 0.422893 0.152813 -17.881270 0.044634 0.935393
18 0.522704 0.494800 -8.605872 0.037519 0.607096
19 0.256256 0.899957 -7.829686 0.084520 0.022247
20 0.517682 0.815305 -6.025156 0.051307 0.079856
21 0.276156 0.947701 -6.949583 0.130681 0.005310
22 0.312911 0.050768 -24.245583 0.048975 0.932583
23 0.729367 0.652751 -6.501610 0.050899 0.102056
24 0.568820 0.809036 -5.338006 0.057978 0.078530
25 0.789089 0.754093 -5.462509 0.073350 0.157751
26 0.767610 0.787052 -5.132234 0.171831 0.174536
27 0.646381 0.615952 -7.402762 0.462476 0.204789
28 0.189143 0.923218 -8.187103 0.114483 0.000162
29 0.326286 0.176659 -21.674727 0.056445 0.926273
30 0.645920 0.284357 -14.288159 0.056819 0.788129
31 0.653902 0.835561 -5.648622 0.064650 0.053876
32 0.256405 0.021973 -32.966220 0.047760 0.963209
33 0.569960 0.752840 -5.628180 0.355360 0.128330
34 0.260148 0.872615 -9.842303 0.080007 0.005975
35 0.321013 0.203286 -14.384658 0.042971 0.911763
36 0.617460 0.784080 -7.638440 0.051896 0.099935
37 0.674254 0.611690 -8.368063 0.054069 0.655135
38 0.595532 0.806073 -5.977532 0.067370 0.122243
39 0.277600 0.835827 -10.160320 0.070545 0.014217
40 0.325106 0.046042 -28.753576 0.046194 0.966515
41 0.551570 0.410289 -10.231307 0.043446 0.640237
42 0.550944 0.861622 -5.162889 0.072799 0.066933
43 0.699763 0.753553 -5.486289 0.057467 0.108573
44 0.272695 0.927924 -6.761848 0.108619 0.014219
45 0.599190 0.345586 -14.190603 0.048495 0.811655
46 0.735399 0.536135 -10.632020 0.046663 0.192130
47 0.762082 0.725300 -5.641491 0.184091 0.190541
48 0.325622 0.265924 -14.424222 0.046704 0.779556
49 0.601290 0.846842 -4.467612 0.062983 0.049764
instrumentalness liveness valence tempo
0 0.012862 0.119604 0.610937 91.825902
1 0.391098 0.157518 0.089462 115.284311
2 0.857511 0.125135 0.103664 77.182169
3 0.057124 0.170864 0.418495 170.613582
4 0.844095 0.350912 0.204377 111.275489
5 0.017626 0.100758 0.812671 127.091134
6 0.018348 0.123937 0.224861 116.722008
7 0.043116 0.126871 0.268154 122.666184
8 0.027714 0.708424 0.510800 126.642636
9 0.787025 0.128012 0.143622 100.997127
10 0.001562 0.139392 0.662738 98.252214
11 0.022195 0.706925 0.617623 112.332472
12 0.010750 0.366053 0.801212 120.065166
13 0.016701 0.148638 0.656507 167.550219
14 0.846895 0.135797 0.154867 158.944316
15 0.090382 0.116717 0.190138 99.038614
16 0.025143 0.147107 0.826025 159.414086
17 0.871393 0.118960 0.160820 115.211041
18 0.018188 0.146095 0.423475 132.683832
19 0.731383 0.380600 0.310424 160.550414
20 0.043056 0.114947 0.558883 108.087292
21 0.100312 0.165424 0.168013 143.009142
22 0.071856 0.136039 0.163064 101.235722
23 0.017560 0.119760 0.489910 124.812768
24 0.032863 0.350497 0.390683 130.595293
25 0.004165 0.107227 0.772065 97.149201
26 0.020615 0.321896 0.665247 103.065013
27 0.006536 0.257362 0.502857 110.747524
28 0.807092 0.123032 0.201592 168.943759
29 0.497099 0.697045 0.223895 104.050136
30 0.058963 0.138916 0.624248 111.711876
31 0.749402 0.165323 0.291985 126.448341
32 0.874901 0.100595 0.112177 73.572308
33 0.018522 0.172696 0.531614 176.514880
34 0.816795 0.128945 0.259733 135.793615
35 0.042932 0.173722 0.199078 80.435184
36 0.735660 0.168884 0.805620 136.681920
37 0.026929 0.164922 0.820841 105.576421
38 0.022642 0.133181 0.779725 166.843037
39 0.780080 0.148558 0.317591 97.050187
40 0.898045 0.106336 0.182614 134.855894
41 0.025229 0.139161 0.405124 89.198860
42 0.125006 0.709822 0.458938 123.790489
43 0.008560 0.289750 0.478816 96.137842
44 0.166406 0.341876 0.236993 105.341238
45 0.823052 0.141048 0.537431 110.537931
46 0.014568 0.106582 0.848595 119.276284
47 0.004509 0.103830 0.610709 99.563218
48 0.076807 0.162613 0.327256 175.201756
49 0.032006 0.121711 0.307443 129.230525
Data-Driven Playlist Creation: By applying K-means clustering to music data, we identified distinct clusters of songs. Each cluster represents songs with similar characteristics, allowing us to create diverse playlists. Our analysis ensures that each playlist falls within the desired size range, catering to various music preferences.
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Cluster the songs into 50 playlists with K-means on the scaled audio
# features, then label each cluster with a human-readable playlist name.
# Adds two columns to spotify5k_df: 'playlist_cluster' (int 0-49) and
# 'playlist_name' (str).

# Audio-feature columns used as the clustering space.
columns_for_clustering = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                          'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                          'duration_ms']

# Min-max scale so features with large ranges (tempo, duration_ms) don't
# dominate the Euclidean distances used by K-means.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(spotify5k_df[columns_for_clustering])

# n_init pinned explicitly: sklearn changed the default from 10 to 'auto'
# in 1.4, and an explicit value keeps cluster assignments reproducible
# across versions (together with random_state).
kmeans = KMeans(n_clusters=50, n_init=10, random_state=42)
spotify5k_df['playlist_cluster'] = kmeans.fit_predict(scaled_features)

# One display name per cluster label (index i -> playlist_names[i]).
playlist_names = [
    "Relaxing Vibes", "Energetic Workout", "Chill Lounge", "Happy Beats", "Soothing Melodies",
    "Motivational Mix", "Groovy Tunes", "Nighttime Chill", "Summer Fun", "Cozy Fireplace",
    "Mellow Grooves", "High Tempo", "Zen Garden", "Romantic Serenade", "Sunny Day", "Late Night Jazz",
    "Dreamy Atmosphere", "Pump-up Party", "Rainy Day", "Feel-good Funk", "Island Escapade",
    "Classic Rock", "Jazzy Brunch", "Mindful Meditation", "Epic Soundtrack", "Stress Relief",
    "Salsa Fiesta", "Soulful R&B", "Electronic Dance", "Country Roads", "Urban Vibes",
    "Smooth Jazz", "Indie Discovery", "Reggae Vibes", "Guitar Strumming", "Vintage Classics",
    "Latin Fever", "Deep House", "Piano Reflections", "Throwback Hits", "Alternative Edge",
    "Calm Waters", "Disco Fever", "Motown Magic", "Folk Fusion", "Hip-hop Groove",
    "Classical Symphony", "Techno Beats", "Opera Night", "Ambient Bliss"
]

# Guard: if the list and n_clusters ever drift apart, .map() would silently
# produce NaN playlist names for the unmatched clusters. Fail loudly instead.
if len(playlist_names) != kmeans.n_clusters:
    raise ValueError(
        f"playlist_names has {len(playlist_names)} entries but "
        f"KMeans produced {kmeans.n_clusters} clusters"
    )

playlist_mapping = {i: name for i, name in enumerate(playlist_names)}

# Map each song's cluster label to its playlist name.
spotify5k_df['playlist_name'] = spotify5k_df['playlist_cluster'].map(playlist_mapping)
# Classify each playlist into a type bucket by its score and show a sorted summary.
#
# Bug fix: the original used `playlist_summary` without ever defining it,
# raising NameError. Build it here from spotify5k_df.
# NOTE(review): 'score' is assumed to be the number of songs in each playlist
# (an integer, as the original `in range(...)` membership tests require) —
# confirm this matches the intended playlist metric.
playlist_summary = (
    spotify5k_df.groupby('playlist_name')
    .size()
    .reset_index(name='score')
    .rename(columns={'playlist_name': 'name'})
)

# Score buckets: 0-99 -> Mood, 100-199 -> Emotion, 200+ -> Activity.
# Vectorized with np.select instead of the original iterrows loop.
playlist_summary['playlist_type'] = np.select(
    [playlist_summary['score'] < 100, playlist_summary['score'] < 200],
    ['Mood', 'Emotion'],
    default='Activity',
)

# Largest playlists first; ties broken by type, then alphabetically by name.
playlist_summary.sort_values(by=['score', 'playlist_type', 'name'], ascending=[False, True, True])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[32], line 43 41 mood_range = range(0, 100) 42 emotion_range = range(100, 200) ---> 43 activity_range = range(200, max(playlist_summary['score'])+1) 45 # Assign playlist_type based on score 46 playlist_summary['playlist_type'] = '' NameError: name 'playlist_summary' is not defined